None Ch5
In [1]:
from IPython.display import display, HTML

# Set the notebook width to 80%
display(HTML("<style>.container { width: 80% !important; }</style>"))
In [2]:
!jupyter notebook list
Currently running servers:
http://localhost:2831/ :: /home/kzy816
In [3]:
# Needs to paste `http://localhost:3110`, no ending `/`
port = 2831

import IPython
import json
import requests

hostname = !hostname

# Get the current Jupyter server's info
result = !jupyter notebook list
for res in result:
    if f'http://localhost:{port}/' in res:
        result = res.split(' :: ')[0]
        break

# Print the server URL
print(f'Current Jupyter server {hostname} URL: {result}')

# Get the list of running notebooks
response = requests.get(f'{result}api/sessions')

# # Convert the JSON data to a string and print it
# print(json.dumps(response.json(), indent=4))

nbs = response.json()
nb_names = [nb['name'] for nb in nbs]
print(len(nb_names), nb_names)
Current Jupyter server ['qnode2181'] URL: http://localhost:2831/
1 ['Ch5-40d3f2bd-f2d0-46d4-94fb-22a9d809ba0d.ipynb']

Import dep

In [4]:
import logging

LOGGING_FORMAT = "%(asctime)s|(%(pathname)s)[%(lineno)d]: %(message)s"

logging.basicConfig(format=LOGGING_FORMAT)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

from itertools import product
import math
import time
from joblib import Parallel, delayed, parallel_backend

import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook
In [5]:
import plotly
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.io as pio

pio.renderers.default = "notebook"
In [6]:
COLOR_LIST = plotly.colors.DEFAULT_PLOTLY_COLORS
len(COLOR_LIST)
Out[6]:
10

Example 5.1: Blackjack fixed policy

  • Episodic Monte Carlo for policy evaluation.
  • Evaluate a fixed policy from the dealer and player.
In [7]:
# # ChatGPT version
# def Monte_Carlo_sim_blackjack_0(n_episodes=1000000):
#     """
#     Monte Carlo simulation for Blackjack game.
    
#     Parameters
#     ----------
#     n_episodes : int
#         Number of episodes to simulate.
        
#     Returns
#     -------
#     Q : dict
#         Dictionary of state-action values.
#     N : dict
#         Dictionary of state-action visit counts.
#     """
#     # Initialize dictionaries of state-action values and visit counts
#     Q = {}
#     N = {}
    
#     # Loop over episodes
#     for i in range(n_episodes):
#         # Initialize an empty list to store state-action pairs
#         episode = []
        
#         # Initialize the state
#         state = (np.random.randint(12, 22), np.random.randint(1, 11), False)
        
#         # Loop over steps in the episode
#         while True:
#             # If the state is not in the dictionary, add it
#             if state not in Q:
#                 Q[state] = {}
#                 N[state] = {}
#                 for action in range(2):
#                     Q[state][action] = 0
#                     N[state][action] = 0
            
#             # Choose an action
#             action = np.random.randint(2)
            
#             # Append the state-action pair to the episode
#             episode.append((state, action))
            
#             # Increment the visit count for the state-action pair
#             N[state][action] += 1
            
#             # Take the action
#             player_sum, dealer_card, usable_ace = state
#             if action == 1:
#                 player_sum += np.random.randint(1, 11)
#                 if player_sum > 21:
#                     if usable_ace:
#                         player_sum -= 10
#                         usable_ace = False
#                     else:
#                         break
#             else:
#                 break
            
#             # Update the state
#             state = (player_sum, dealer_card, usable_ace)
        
#         # Loop over state-action pairs in the episode
#         for state, action in episode:
#             # Calculate the return
#             G = 1 if state[0] > 21 else 1.5 if state[0] == 21 else 0
#             # Update the state-action value
#             Q[state][action] += (G - Q[state][action]) / N[state][action]
    
#     return Q, N
    
In [8]:
N_JOBS = 20
ALPHA = 0.8
CUST_JET = [
    [0.0, f'rgba(0, 0, 131, {ALPHA})'],    # Dark blue, more transparent
    [0.11, f'rgba(0, 60, 170, {ALPHA})'],  # Blue
    [0.22, f'rgba(5, 255, 255, {ALPHA})'], # Cyan
    [0.33, f'rgba(255, 255, 0, {ALPHA})'], # Yellow
    [0.44, f'rgba(250, 0, 0, {ALPHA})'],   # Red
    [0.55, f'rgba(128, 0, 0, {ALPHA})'],   # Dark red
    [1.0, f'rgba(128, 0, 0, {ALPHA})']     # Dark red, same as above to end
]
# COLORSCALE = CUST_JET
COLORSCALE = "Jet"

def organize_state_val(state_val_pair):
    
    def _update_state_val(state, val):
        o_val, o_cnt = state_val[state]
        state_val[state] = (o_val+(val-o_val)/(o_cnt+1), o_cnt+1)
        
    # Initial state values and cnts
    state_val = {}
    for i in range(1, 11):
        for j in range(12, 22):
            state_val[i, j, 1] = (0, 0) # usable ace
            state_val[i, j, 0] = (0, 0) # no usable ace
    
    for state, val in state_val_pair:
        _update_state_val(state, val)
        
    state_val_0 = np.zeros((10, 10))
    state_val_cnt_0 = np.zeros((10, 10), dtype=int)
    state_val_1 = np.zeros((10, 10))
    state_val_cnt_1 = np.zeros((10, 10), dtype=int)
    for i in range(1, 11):
        for j in range(12, 22):
            state_val_0[i-1, j-12] = state_val[i, j, 0][0]
            state_val_cnt_0[i-1, j-12] = state_val[i, j, 0][1]
            state_val_1[i-1, j-12] = state_val[i, j, 1][0]
            state_val_cnt_1[i-1, j-12] = state_val[i, j, 1][1]
        
    return np.array(state_val_0), np.array(state_val_1), np.array(state_val_cnt_0), np.array(state_val_cnt_1)
    

def Monte_Carlo_sim_blackjack_1(n_ep, dealer_thre=17, player_thre=20, n_jobs=1, verbose=0):
    n_suit = 13
    card_suit = range(1, 1+n_suit)
    
    def _ini_card_val(card):
        return int(min(card, 10)+10*(card==1))
    
    def _card_val(card):
        return int(min(card, 10))
    
    def _hits_or_sticks_round(player_cards, stick_thre):
        # Initial dealing
        player_sum = sum(_ini_card_val(card) for card in player_cards)
        usable_ace = int(np.any([card==1 for card in player_cards]))
        if usable_ace:
            if player_sum>21:
                player_sum -= 10
        else:
            while player_sum<12:
                new_card = np.random.choice(card_suit, 1)[0]
                if new_card==1:
                    if player_sum>=11:
                        player_sum += 1
                    else:
                        player_sum += 11
                        usable_ace = 1
                else:
                    player_sum += _card_val(new_card)
                    
        assert player_sum<=21, "Currently sum should <= 21"
        initial_sum = player_sum
        
        # Hits or sticks
        can_use_ace = usable_ace
        while player_sum<stick_thre:
            player_sum += _card_val(np.random.choice(card_suit, 1)[0])
            if player_sum>21 and can_use_ace:
                player_sum -= 10
                can_use_ace = 0
                
        return initial_sum, usable_ace, player_sum
    
    def _one_episode():
        # Initial dealing
        dealer_show = _card_val(np.random.choice(card_suit, 1)[0])
        
        player_cards = list(np.random.choice(card_suit, 2))
        player_sum, usable_ace, final_player_sum = _hits_or_sticks_round(player_cards, player_thre)
        
        state = (dealer_show, player_sum, usable_ace)
        if final_player_sum>21:
            reward = -1
        else:
            dealer_sum = _ini_card_val(dealer_show)
            had_ace = d_usable_ace = int(dealer_show==1)
            while dealer_sum<dealer_thre:
                new_card = np.random.choice(card_suit, 1)[0]
                if had_ace:
                    dealer_sum += _card_val(new_card)
                else:
                    dealer_sum += _ini_card_val(new_card)
                    if new_card==1:
                        had_ace = d_usable_ace = 1
                if dealer_sum>21 and d_usable_ace:
                    dealer_sum -= 10
                    d_usable_ace = 0
                if dealer_sum>final_player_sum:
                    break
            if dealer_sum>21:
                reward = 1
            else:
                reward = np.sign(final_player_sum-dealer_sum)
            
        return state, reward
    
    with parallel_backend('loky', n_jobs=n_jobs):
        res = Parallel(verbose=verbose, pre_dispatch="1.5*n_jobs")(
            delayed(_one_episode)() for _ in range(n_ep)
        )
    
    return organize_state_val(res)


# We don't care about how we reach the state, just start with the states
def Monte_Carlo_sim_blackjack_2(n_ep, dealer_thre=17, player_thre=20, n_jobs=1, verbose=0):
    n_suit = 13
    card_suit = range(1, 1+n_suit)
    
    def _ini_card_val(card):
        return int(min(card, 10)+10*(card==1))
    
    def _card_val(card):
        return int(min(card, 10))
    
    def _hits_or_sticks_round(state, stick_thre):
        _, player_sum, usable_ace = state
        # Hits or sticks
        while player_sum<stick_thre:
            player_sum += _card_val(np.random.choice(card_suit, 1)[0])
            if player_sum>21 and usable_ace:
                player_sum -= 10
                usable_ace = 0
                
        return player_sum
    
    def _one_episode():
        state = (
            np.random.choice(range(1, 11), 1)[0],
            np.random.choice(range(12, 22), 1)[0],
            np.random.choice(range(2), 1)[0]
        )
        
        dealer_show = state[0]
        
        final_player_sum = _hits_or_sticks_round(state, player_thre)
        
        if final_player_sum>21:
            reward = -1
        else:
            dealer_sum = _ini_card_val(dealer_show)
            had_ace = d_usable_ace = int(dealer_show==1)
            while dealer_sum<dealer_thre:
                new_card = np.random.choice(card_suit, 1)[0]
                if had_ace:
                    dealer_sum += _card_val(new_card)
                else:
                    dealer_sum += _ini_card_val(new_card)
                    if new_card==1:
                        had_ace = d_usable_ace = 1
                if dealer_sum>21 and d_usable_ace:
                    dealer_sum -= 10
                    d_usable_ace = 0
                if dealer_sum>final_player_sum:
                    break
            if dealer_sum>21:
                reward = 1
            else:
                reward = np.sign(final_player_sum-dealer_sum)
        
        return state, reward
    
    with parallel_backend('loky', n_jobs=n_jobs):
        res = Parallel(verbose=verbose, pre_dispatch="1.5*n_jobs")(
            delayed(_one_episode)() for _ in range(n_ep)
        )
                
    return organize_state_val(res)


def add_axex_gridline(fig, x_vals, y_vals, ls_rc):
    for ri, ci in ls_rc:
        # Manually add gridlines at half-tick positions for x-axis
        for x in x_vals:
            fig.add_shape(
                type="line", x0=x, y0=min(y_vals), x1=x, y1=max(y_vals),
                line=dict(color="Grey", width=1, dash="dot"),
                row=ri, col=ci
            )

        # Manually add gridlines at half-tick positions for y-axis
        for y in y_vals:
            fig.add_shape(
                type="line", x0=min(x_vals), y0=y, x1=max(x_vals), y1=y,
                line=dict(color="Grey", width=1, dash="dot"),
                row=ri, col=ci
            )

def plot_state_policy(state_policy, postfix, colorscale=COLORSCALE):
    plot_val_over_state_space("Policy", state_policy, postfix, colorscale=colorscale)

def plot_val_over_state_space(title, val_over_state_space, postfix, zlim=None, colorscale=COLORSCALE):
    fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.2, subplot_titles=('No Ace', 'With Ace'))
    if zlim is not None:
        zmin, zmax = zlim
    else:
        zmin, zmax = min(np.min(vals) for vals in val_over_state_space), max(np.max(vals) for vals in val_over_state_space)
    fig.add_trace(
        go.Heatmap(
            z=val_over_state_space[0].T,
            colorscale=colorscale,
            x=[i for i in range(1, 11)],
            y=[j for j in range(12, 22)],
            hoverongaps=False,
            zmin=zmin,
            zmax=zmax,
            colorbar=dict(title=title, x=0.42)  # Adjust x to position the color bar between subplots
        ), row=1, col=1
    )
    fig.add_trace(
        go.Heatmap(
            z=val_over_state_space[1].T,
            colorscale=colorscale,
            x=[i for i in range(1, 11)],
            y=[j for j in range(12, 22)],
            hoverongaps=False,
            zmin=zmin,
            zmax=zmax,
            colorbar=dict(title=title, x=1.02)  # Adjust x to position the color bar between subplots
        ), row=1, col=2
    )
    
    xaxis_kwargs = dict(
        title='x: Dealer showing', 
        tickvals=np.arange(1, 11),
        ticktext=[str(i) for i in range(1, 11)],
        showgrid=False
        # showgrid=True, gridwidth=1, gridcolor='grey', layer='above traces'
    )
    yaxis_kwargs = dict(
        title='y: Player sum', 
        tickvals=np.arange(12, 22),
        ticktext=[str(i) for i in range(12, 22)],
        showgrid=False
        # showgrid=True, gridwidth=1, gridcolor='grey', layer='above traces'
    )
    
    fig.update_layout(
        title=f'State {title} Heatmaps {postfix}',
        xaxis=xaxis_kwargs,
        yaxis=yaxis_kwargs,
        xaxis2=xaxis_kwargs,
        yaxis2=yaxis_kwargs,
        autosize=False,
        width=1000,
        height=500,
    )
    
    gl_x_vals = np.arange(0.5, 11, 1)
    gl_y_vals = np.arange(11.5, 22, 1)
    add_axex_gridline(fig, gl_x_vals, gl_y_vals, [(1, 1), (1, 2)])

    # add_axex_gridline(fig, np.arange(1, 11), np.arange(12, 22))

    fig.show()

def plot_arr_bj_state_val(state_val_res, postfix, max_cnt=None, colorscale=COLORSCALE):
    state_val_0, state_val_1, state_val_cnt_0, state_val_cnt_1 = state_val_res
    fig = make_subplots(
        rows=2, cols=2, 
        horizontal_spacing=0.2, vertical_spacing=0.2,
        subplot_titles=('No Ace', 'With Ace', 'Count No Ace', 'Count With Ace'))
    fig.add_trace(
        go.Heatmap(
            z=state_val_0.T,
            colorscale=colorscale,
            x=[i for i in range(1, 11)],
            y=[j for j in range(12, 22)],
            hoverongaps=False,
            zmin=-1,
            zmax=1,
            colorbar=dict(title="Val", x=0.42, y=0.82, len=0.4)  # Adjust x to position the color bar between subplots
        ), row=1, col=1
    )
    fig.add_trace(
        go.Heatmap(
            z=state_val_1.T,
            colorscale=colorscale,
            x=[i for i in range(1, 11)],
            y=[j for j in range(12, 22)],
            hoverongaps=False,
            zmin=-1,
            zmax=1,
            colorbar=dict(title="Val", x=1.02, y=0.82, len=0.4)  # Adjust x to position the color bar between subplots
        ), row=1, col=2
    )
    
    fig.add_trace(
        go.Heatmap(
            z=state_val_cnt_0.T,
            colorscale=colorscale,
            x=[i for i in range(1, 11)],
            y=[j for j in range(12, 22)],
            hoverongaps=False,
            zmin=0,
            zmax=max_cnt,
            colorbar=dict(title="Cnt", x=0.42, y=0.2, len=0.4)  # Adjust x to position the color bar between subplots
        ), row=2, col=1
    )
    fig.add_trace(
        go.Heatmap(
            z=state_val_cnt_1.T,
            colorscale=colorscale,
            x=[i for i in range(1, 11)],
            y=[j for j in range(12, 22)],
            hoverongaps=False,
            zmin=0,
            zmax=max_cnt,
            colorbar=dict(title="Cnt", x=1.02, y=0.2, len=0.4)  # Adjust x to position the color bar between subplots
        ), row=2, col=2
    )
    
    xaxis_kwargs = dict(
        title='x: Dealer showing', 
        tickvals=np.arange(1, 11),
        ticktext=[str(i) for i in range(1, 11)],
        showgrid=False
        # showgrid=True, gridwidth=1, gridcolor='grey', layer='above traces'
    )
    yaxis_kwargs = dict(
        title='y: Player sum', 
        tickvals=np.arange(12, 22),
        ticktext=[str(i) for i in range(12, 22)],
        showgrid=False
        # showgrid=True, gridwidth=1, gridcolor='grey', layer='above traces'
    )

    fig.update_layout(
        title=f'State-Value Heatmaps {postfix}',
        xaxis=xaxis_kwargs,
        yaxis=yaxis_kwargs,
        xaxis2=xaxis_kwargs,
        yaxis2=yaxis_kwargs,
        xaxis3=xaxis_kwargs,
        yaxis3=yaxis_kwargs,
        xaxis4=xaxis_kwargs,
        yaxis4=yaxis_kwargs,
        autosize=False,
        width=1000,
        height=1000,
    )
    
    gl_x_vals = np.arange(0.5, 11, 1)
    gl_y_vals = np.arange(11.5, 22, 1)
    add_axex_gridline(fig, gl_x_vals, gl_y_vals, [(1, 1), (1, 2), (2, 1), (2, 2)])

    fig.show()
    
    
def plot_arr_bj_state_val_3d(state_val_arr, postfix):
    # Assuming state_val_arr is your 2D array of state values
    # Generate a meshgrid for your state dimensions
    # For example, if player's sum ranges from 12 to 21 and dealer's showing card from 1 to 10
    player_sum = np.arange(12, 22)  # Player's sum range
    dealer_showing = np.arange(1, 11)  # Dealer's showing card range
    X, Y = np.meshgrid(dealer_showing, player_sum)

    # Create a 3D surface plot
    fig = go.Figure(data=[
        go.Surface(
        z=state_val_arr, x=X, y=Y,
            colorbar=dict(
                title='Value',  # Title of the colorbar
                titleside='right',
                tickmode='array',
                tickvals=[-1, 1],  # Custom tick marks
                ticktext=['-1', '1'],  # Custom tick text
            )    
        )
    ])

    # Customize the layout
    fig.update_layout(
        title=f'State-Value {postfix}', autosize=False,
        scene=dict(
            xaxis_title='x: Dealer Showing',
            yaxis_title='y: Player Sum',
            zaxis=dict(range=(-1, 1), title='z: State Value')
        ),
        # scene=dict(
        #     xaxis=dict(
        #         title='Dealer Showing',
        #         showgrid=True  # Show grid lines on the x-axis
        #     ),
        #     yaxis=dict(
        #         title='Player Sum',
        #         showgrid=True  # Show grid lines on the y-axis
        #     ),
        #     zaxis=dict(
        #         range=(-1, 1),
        #         title='State Value',
        #         showgrid=True  # Show grid lines on the z-axis
        #     )
        # ),
        width=700, height=700,
        margin=dict(l=25, r=20, b=25, t=40),
        scene_camera=dict(
            up=dict(x=0, y=0, z=1),  # Sets the upward direction
            center=dict(x=0, y=0, z=0),  # Centers the view
            eye=dict(x=2, y=-2, z=2)  # Positions the camera view point
        )
    )

    # Show the plot
    fig.show()
In [9]:
ls_n_ep = [10_000, 500_000, 1_000_000]

Func 1

In [12]:
%%time
ls_state_val_1 = []
for n_ep in ls_n_ep:
    state_val_1 = Monte_Carlo_sim_blackjack_1(n_ep=n_ep)
    ls_state_val_1.append(state_val_1)
CPU times: user 2min 25s, sys: 794 ms, total: 2min 26s
Wall time: 2min 25s
In [13]:
for n_ep, state_val_1 in zip(ls_n_ep, ls_state_val_1):
    plot_arr_bj_state_val(state_val_1, f'(n_ep={n_ep})')
In [14]:
%%time
ls_state_val_1_paral = []
for n_ep in ls_n_ep:
    state_val_1_paral = Monte_Carlo_sim_blackjack_1(n_ep=n_ep, n_jobs=N_JOBS, verbose=2)
    ls_state_val_1_paral.append(state_val_1_paral)
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  11 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 8125 tasks      | elapsed:    0.3s
[Parallel(n_jobs=20)]: Done 10000 out of 10000 | elapsed:    0.5s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  11 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 16590 tasks      | elapsed:    0.4s
[Parallel(n_jobs=20)]: Done 460876 tasks      | elapsed:    4.1s
[Parallel(n_jobs=20)]: Done 500000 out of 500000 | elapsed:    4.3s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  11 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 16590 tasks      | elapsed:    0.4s
[Parallel(n_jobs=20)]: Done 466894 tasks      | elapsed:    4.2s
[Parallel(n_jobs=20)]: Done 992595 tasks      | elapsed:    8.3s
[Parallel(n_jobs=20)]: Done 1000000 out of 1000000 | elapsed:    8.4s finished
CPU times: user 8.1 s, sys: 133 ms, total: 8.23 s
Wall time: 14.1 s
In [15]:
for n_ep, state_val_1_paral in zip(ls_n_ep, ls_state_val_1_paral):
    plot_arr_bj_state_val(state_val_1_paral, f'(n_ep={n_ep})')
In [ ]:
 

Func 2

In [16]:
%%time
ls_state_val_2 = []
for n_ep in ls_n_ep:
    state_val_2 = Monte_Carlo_sim_blackjack_2(n_ep=n_ep)
    ls_state_val_2.append(state_val_2)
In [ ]:
for n_ep, state_val_2 in zip(ls_n_ep, ls_state_val_2):
    plot_arr_bj_state_val(state_val_2, f'(n_ep={n_ep})')
In [ ]:
%%time
ls_state_val_2_paral = []
for n_ep in ls_n_ep:
    state_val_2_paral = Monte_Carlo_sim_blackjack_2(n_ep=n_ep, n_jobs=N_JOBS, verbose=2)
    ls_state_val_2_paral.append(state_val_2_paral)
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  11 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 8125 tasks      | elapsed:    0.3s
[Parallel(n_jobs=20)]: Done 10000 out of 10000 | elapsed:    0.5s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  11 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 16590 tasks      | elapsed:    0.5s
[Parallel(n_jobs=20)]: Done 462882 tasks      | elapsed:    4.7s
[Parallel(n_jobs=20)]: Done 500000 out of 500000 | elapsed:    4.9s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  11 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 16590 tasks      | elapsed:    0.5s
[Parallel(n_jobs=20)]: Done 466894 tasks      | elapsed:    4.8s
[Parallel(n_jobs=20)]: Done 992595 tasks      | elapsed:    9.5s
[Parallel(n_jobs=20)]: Done 1000000 out of 1000000 | elapsed:    9.6s finished
CPU times: user 9.53 s, sys: 145 ms, total: 9.68 s
Wall time: 16.3 s
In [ ]:
for n_ep, state_val_2_paral in zip(ls_n_ep, ls_state_val_2_paral):
    plot_arr_bj_state_val(state_val_2_paral, f'(n_ep={n_ep})')

Case 0: dealer_thre=17, player_thre=20

In [ ]:
%%time
state_val_c0 = Monte_Carlo_sim_blackjack_2(n_ep=1_000_000, dealer_thre=17, player_thre=20, n_jobs=N_JOBS, verbose=2)
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  11 tasks      | elapsed:    0.5s
[Parallel(n_jobs=20)]: Done 306 tasks      | elapsed:    0.6s
[Parallel(n_jobs=20)]: Done 11618 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done 425954 tasks      | elapsed:    4.9s
[Parallel(n_jobs=20)]: Done 990822 tasks      | elapsed:   10.0s
[Parallel(n_jobs=20)]: Done 1000000 out of 1000000 | elapsed:   10.1s finished
In [ ]:
plot_arr_bj_state_val_3d(state_val_c0[0].T, f'(n_ep=1_000_000), (dealer, player)=(20, 20), no ace)')
plot_arr_bj_state_val_3d(state_val_c0[1].T, f'(n_ep=1_000_000), (dealer, player)=(20, 20), with ace)')
In [ ]:
plot_arr_bj_state_val(state_val_c0, f'(n_ep=1_000_000), (dealer, player)=(17, 20)')

Case 1: dealer_thre=20, player_thre=20

In [ ]:
%%time
state_val_c1 = Monte_Carlo_sim_blackjack_2(n_ep=1_000_000, dealer_thre=20, player_thre=20, n_jobs=N_JOBS, verbose=2)
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  11 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 16590 tasks      | elapsed:    0.4s
[Parallel(n_jobs=20)]: Done 466894 tasks      | elapsed:    5.0s
[Parallel(n_jobs=20)]: Done 992595 tasks      | elapsed:    9.9s
[Parallel(n_jobs=20)]: Done 1000000 out of 1000000 | elapsed:    9.9s finished
In [ ]:
plot_arr_bj_state_val_3d(state_val_c1[0].T, f'(n_ep=1_000_000, (dealer, player)=(20, 20), no ace)')
plot_arr_bj_state_val_3d(state_val_c1[1].T, f'(n_ep=1_000_000, (dealer, player)=(20, 20), with ace)')
In [ ]:
plot_arr_bj_state_val(state_val_c1, f'(n_ep=1_000_000, dealer_thre=20, player_thre=20)')

Case 2: dealer_thre=17, player_thre=17

In [ ]:
%%time
state_val_c2 = Monte_Carlo_sim_blackjack_2(n_ep=1_000_000, dealer_thre=17, player_thre=17, n_jobs=N_JOBS, verbose=2)
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  11 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 16590 tasks      | elapsed:    0.5s
[Parallel(n_jobs=20)]: Done 274382 tasks      | elapsed:    3.1s
[Parallel(n_jobs=20)]: Done 648142 tasks      | elapsed:    6.9s
[Parallel(n_jobs=20)]: Done 993020 tasks      | elapsed:   10.3s
[Parallel(n_jobs=20)]: Done 1000000 out of 1000000 | elapsed:   10.3s finished
In [ ]:
plot_arr_bj_state_val_3d(state_val_c2[0].T, f'(n_ep=1_000_000, (dealer, player)=(17, 17), no ace)')
plot_arr_bj_state_val_3d(state_val_c2[1].T, f'(n_ep=1_000_000, (dealer, player)=(17, 17), with ace)')
In [ ]:
plot_arr_bj_state_val(state_val_c2, f'(n_ep=1_000_000, dealer_thre=17, player_thre=17)')

Case 3: dealer_thre=20, player_thre=17

In [ ]:
%%time
state_val_c3 = Monte_Carlo_sim_blackjack_2(n_ep=1_000_000, dealer_thre=20, player_thre=17, n_jobs=N_JOBS, verbose=2)
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  11 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 16590 tasks      | elapsed:    0.5s
[Parallel(n_jobs=20)]: Done 466894 tasks      | elapsed:    5.1s
[Parallel(n_jobs=20)]: Done 992595 tasks      | elapsed:   10.1s
[Parallel(n_jobs=20)]: Done 1000000 out of 1000000 | elapsed:   10.1s finished
In [ ]:
plot_arr_bj_state_val_3d(state_val_c3[0].T, f'(n_ep=1_000_000, (dealer, player)=(20, 17), no ace)')
plot_arr_bj_state_val_3d(state_val_c3[1].T, f'(n_ep=1_000_000, (dealer, player)=(20, 17), with ace)')
In [ ]:
plot_arr_bj_state_val(state_val_c3, f'(n_ep=1_000_000, dealer_thre=20, player_thre=17)')

Example 5.2: Blackjack optimal policy

  • Using Monte Carlo ES (Exploring Starts)
  • The optimal policy is a threshold policy. Meaning, if a player decides to stick at certain state, it will not hit when reaching state higher than the threshold. This is can be proved by the following.
In [ ]:
FLOAT_TOL = 1e-8
REL_ABS_DIFF_RD_HL_MUL = 5

def Monte_Carlo_ES_blackjack_1(
    n_ep, n_paral, dealer_thre=17, ini_player_thre=20, dealer_policy="fixed",
    pol_impr=True, tol=1e-6, show_rdis=None, n_jobs=1, verbose=0
):
    assert dealer_thre<=21
    assert ini_player_thre<=21
    assert n_ep % n_paral == 0
    assert n_paral >= n_jobs
    
    n_suit = 13
    ndl, npl = 10, 10
    d_s0, p_s0 = 1, 12
    state_act_val_sum = np.zeros((ndl, npl, 2, 2))
    state_act_val_cnt = np.ones((ndl, npl, 2, 2)) # So cnt is never 0 and can always be at the denominator
    state_act_val = np.zeros((ndl, npl, 2, 2))
    state_policy = np.zeros((ndl, npl, 2), dtype=int) # be careful about dtype here, if we need to use it as index, it needs to be int
    for i in range(ndl): # Dealer showing
        for j in range(npl): # Player sum
            for k in range(0, 2): # Usable ace
                # State
                if j+p_s0<ini_player_thre:
                    # Initial policy
                    # Whenever the player sum is smaller than ini_player_thre, hit 
                    state_policy[i, j, k] = 1
    
    def _ini_card_val(card):
        return int(min(card, 10)+10*(card==1))
    
    def _card_val(card):
        return int(min(card, 10))
    
    def _organize_state_val(state_act_val, state_act_val_cnt, state_policy):
        state_val = np.max(state_act_val, axis=-1)
        state_val_cnt = np.sum(state_act_val_cnt, axis=-1)
        state_val_0 = state_val[:, :, 0]
        state_val_1 = state_val[:, :, 1]
        state_val_cnt_0 = state_val_cnt[:, :, 0]
        state_val_cnt_1 = state_val_cnt[:, :, 1]
        state_policy_0 = state_policy[:, :, 0]
        state_policy_1 = state_policy[:, :, 1]
        state_act_val_00 = state_act_val[:, :, 0, 0]
        state_act_val_01 = state_act_val[:, :, 0, 1]
        state_act_val_10 = state_act_val[:, :, 1, 0]
        state_act_val_11 = state_act_val[:, :, 1, 1]
        state_act_val_cnt_00 = state_act_val_cnt[:, :, 0, 0]
        state_act_val_cnt_01 = state_act_val_cnt[:, :, 0, 1]
        state_act_val_cnt_10 = state_act_val_cnt[:, :, 1, 0]
        state_act_val_cnt_11 = state_act_val_cnt[:, :, 1, 1]
        
        return (
            state_val_0, state_val_1, state_val_cnt_0, state_val_cnt_1, state_policy_0, state_policy_1,
            (state_act_val_00, state_act_val_10, state_act_val_cnt_00, state_act_val_cnt_10),
            (state_act_val_01, state_act_val_11, state_act_val_cnt_01, state_act_val_cnt_11)
        )
    
    def _dealer_turn_fixed(dealer_show, final_player_sum):
        assert final_player_sum<=21
        dealer_sum = _ini_card_val(dealer_show)
        had_ace = d_usable_ace = int(dealer_show == 1)
        while 1:
            new_card = np.random.randint(1, 1+n_suit)
            if new_card == 1:
                if had_ace: # if already had an ace or previously had an ace, but not usable now
                    dealer_sum += 1
                else:
                    had_ace = d_usable_ace = 1
                    dealer_sum += _ini_card_val(new_card)
            else:
                dealer_sum += _card_val(new_card)
            if dealer_sum > 21:
                if d_usable_ace:
                    d_usable_ace = 0
                    dealer_sum -= 10
                else: # dealer busted
                    reward = 1 # player reward
                    break
            elif dealer_sum>=dealer_thre:
                reward = np.sign(final_player_sum-dealer_sum)
                break
        return reward
    
    def _dealer_turn_smart(dealer_show, final_player_sum):
        assert final_player_sum<=21
        dealer_sum = _ini_card_val(dealer_show)
        had_ace = d_usable_ace = int(dealer_show == 1)
        while 1:
            new_card = np.random.randint(1, 1+n_suit)
            if new_card == 1:
                if had_ace: # if already had an ace or previously had an ace, but not usable now
                    dealer_sum += 1
                else:
                    had_ace = d_usable_ace = 1
                    dealer_sum += _ini_card_val(new_card)
            else:
                dealer_sum += _card_val(new_card)
            if dealer_sum > 21:
                if d_usable_ace:
                    d_usable_ace = 0
                    dealer_sum -= 10
                else: # dealer busted
                    reward = 1 # player reward
                    break
            elif dealer_sum > final_player_sum or dealer_sum==21 or dealer_sum>=dealer_thre:
                reward = np.sign(final_player_sum-dealer_sum)
                break
        return reward
    
    def _one_episode():
        ls_state_act = []
        # initial state-act
        cur_state_act = (
            np.random.randint(ndl),
            np.random.randint(npl),
            np.random.randint(2),
            np.random.randint(2), # Also need to randomly choose action, b/c we need some experience to evaluate a state-act pair.
        )
        ls_state_act.append(cur_state_act)
        dealer_show = cur_state_act[0] + d_s0 # Convert from dealer state idx to actual dealer showing card
        
        busted = False
        while cur_state_act[-1]: # As long as current act is to hit
            new_card = np.random.randint(1, 1+n_suit)
            player_sum = cur_state_act[1] + p_s0 + _card_val(new_card) # Convert from player state idx to actual player sum
            if player_sum>21: 
                if cur_state_act[2]:
                    player_sum -= 10
                    cur_state_act = [cur_state_act[0], player_sum-p_s0, 0] # Convert from actual player sum to player state idx
                else: # player busted
                    busted = True
                    break
            else:
                cur_state_act = [cur_state_act[0], player_sum-p_s0, cur_state_act[2]]
            policy_act = int(state_policy[tuple(cur_state_act)]) # Follow the current policy
            cur_state_act.append(policy_act)
            ls_state_act.append(tuple(cur_state_act))
        
        if busted:
            reward = -1
        else:
            final_player_sum = cur_state_act[1] + p_s0
            if dealer_policy == "fixed":
                reward = _dealer_turn_fixed(dealer_show, final_player_sum)
            else:
                reward = _dealer_turn_smart(dealer_show, final_player_sum)
        
        state_act_val_sum = np.zeros((ndl, npl, 2, 2))
        state_act_val_cnt = np.zeros((ndl, npl, 2, 2))
        
        for state_act in ls_state_act:
            assert isinstance(state_act, tuple) and len(state_act)==4
            state_act_val_sum[state_act] += reward
            state_act_val_cnt[state_act] += 1
        
        return state_act_val_sum, state_act_val_cnt
            
    n_rd = n_ep // n_paral
    
    # def _get_state_act_val(i, j, k, a):
    #     state_act_val = (
    #         state_act_val_sum[i,j,k,a]/state_act_val_cnt[i,j,k,a] 
    #         if state_act_val_cnt[i,j,k,a]>0
    #         else 1 # Can actually choose any value. Choose 1 to promote exploring zero experience state-act
    #     )
    #     return state_act_val
    
    log_step = n_rd//20
    rel_abs_diff_rd_hl = REL_ABS_DIFF_RD_HL_MUL*state_act_val.size/n_paral
    lam = np.exp(-np.log(2)/rel_abs_diff_rd_hl)
    # norm_state_act_val_abs_diff = np.ones((ndl, npl, 2, 2))/state_act_val.size
    norm_state_act_val_act_similar = np.ones((ndl, npl, 2))/state_policy.size
    
    for rdi in range(n_rd):
        if rdi%log_step==0:
            logger.info("="*20+f"round: {rdi}")
            
        state_act_val_prev = state_act_val.copy()
        state_act_val = state_act_val_sum/state_act_val_cnt
        state_act_val_abs_diff = np.abs(state_act_val-state_act_val_prev) 
        abs_state_act_val = np.abs(state_act_val)           
        max_abs_diff, max_abs_val = np.max(state_act_val_abs_diff), np.max(abs_state_act_val)
        if rdi>0:
            if max_abs_diff<tol*max_abs_val:
                logger.info(f"Converged at round with relative tol {tol:.1e} : {rdi}")
                break
            if pol_impr:
                state_policy = np.argmax(state_act_val, axis=-1)
            # Show the normalized relative absolute difference to see how state-val changes across state space
            # n_norm_state_act_val_abs_diff = state_act_val_abs_diff/np.maximum(abs_state_act_val, (abs_state_act_val<FLOAT_TOL).astype(int))
            # n_norm_state_act_val_abs_diff = n_norm_state_act_val_abs_diff/np.sum(n_norm_state_act_val_abs_diff)
            # norm_state_act_val_abs_diff += (1-lam)*(n_norm_state_act_val_abs_diff-norm_state_act_val_abs_diff)
            # Show the normalized relative similarity of action values for ech state.
            abs_state_act_val_sum = np.sum(abs_state_act_val, axis=-1)
            n_norm_state_act_val_act_similar = (
                2/(1+(
                    np.abs(np.diff(state_act_val, axis=-1).squeeze(axis=-1))/
                    np.maximum(abs_state_act_val_sum, (abs_state_act_val_sum<FLOAT_TOL).astype(int))
                ))-1
            )
            n_norm_state_act_val_act_similar /= np.sum(n_norm_state_act_val_act_similar)
            norm_state_act_val_act_similar += (1-lam)*(n_norm_state_act_val_act_similar-norm_state_act_val_act_similar) 
            
        if show_rdis is not None and rdi in show_rdis:
            logger.info("-"*20+f"round: {rdi}")
            tmp_res = _organize_state_val(state_act_val, state_act_val_cnt, state_policy)
            plot_state_policy([tmp_res[4], tmp_res[5]], f'{n_ep}, ({dealer_thre}, {ini_player_thre}), rdi={rdi}, {max_abs_diff:.2e}, {max_abs_diff/max_abs_val:.2e}')
            if rdi>0:
                # plot_val_over_state_space(
                #     "Rel-Abs-Diff", 
                #     [norm_state_act_val_abs_diff[:, :, 0, 0], norm_state_act_val_abs_diff[:, :, 1, 0]],
                #     f'{n_ep}, ({dealer_thre}, {ini_player_thre}), rdi={rdi}, stick',
                #     zlim=(0, 1)
                # )
                # plot_val_over_state_space(
                #     "Rel-Abs-Diff", 
                #     [norm_state_act_val_abs_diff[:, :, 0, 1], norm_state_act_val_abs_diff[:, :, 1, 1]],
                #     f'{n_ep}, ({dealer_thre}, {ini_player_thre}), rdi={rdi}, hit',
                #     zlim=(0, 1)
                # )
                plot_val_over_state_space(
                    "Rel-Act-Simil", 
                    [n_norm_state_act_val_act_similar[:, :, 0], n_norm_state_act_val_act_similar[:, :, 1]],
                    f'{n_ep}, ({dealer_thre}, {ini_player_thre}), rdi={rdi}, stick v.s. hit',
                    # zlim=(0, 1)
                )
            plot_arr_bj_state_val(tmp_res[:4], f'{n_ep}, ({dealer_thre}, {ini_player_thre})', max_cnt=int((n_ep//state_policy.size)*1.5))
            
        with parallel_backend('loky', n_jobs=n_jobs):
            res = Parallel(verbose=verbose, pre_dispatch="1.5*n_jobs")(
                delayed(_one_episode)() for _ in range(n_paral)
            )
        ls_new_val_sum, ls_new_val_cnt = zip(*res)
        new_val_sum = sum(ls_new_val_sum)
        new_val_cnt = sum(ls_new_val_cnt)
        state_act_val_sum += new_val_sum
        state_act_val_cnt += new_val_cnt
            
    state_act_val = state_act_val_sum/state_act_val_cnt
    
    return _organize_state_val(state_act_val, state_act_val_cnt, state_policy)
In [ ]:
show_rdis = [0, 1000, 5000, 10_000, 20_000, 30_000, 40_000, 50_000, 60_000, 70_000, 80_000, 90_000, 100_000, 500_000, 1_000_000]

Dealer policy: "fixed"

In [ ]:
%%time
# 46 mins
n_ep=4_000_000
state_act_res = Monte_Carlo_ES_blackjack_1(
    n_ep=n_ep, n_paral=50, dealer_thre=17, ini_player_thre=20, 
    pol_impr=True, show_rdis=show_rdis, n_jobs=5, verbose=0
)
2024-07-14 09:07:25,736|(/tmp/ipykernel_37034/2582746201.py)[177]: ====================round: 0
2024-07-14 09:07:25,737|(/tmp/ipykernel_37034/2582746201.py)[206]: --------------------round: 0
/tmp/ipykernel_37034/2582746201.py:208: RuntimeWarning:

invalid value encountered in scalar divide

2024-07-14 09:08:09,524|(/tmp/ipykernel_37034/2582746201.py)[206]: --------------------round: 1000
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
File <timed exec>:3

Cell In[45], line 231, in Monte_Carlo_ES_blackjack_1(n_ep, n_paral, dealer_thre, ini_player_thre, dealer_policy, pol_impr, tol, show_rdis, n_jobs, verbose)
    228     plot_arr_bj_state_val(tmp_res[:4], f'{n_ep}, ({dealer_thre}, {ini_player_thre})', max_cnt=int((n_ep//state_policy.size)*1.5))
    230 with parallel_backend('loky', n_jobs=n_jobs):
--> 231     res = Parallel(verbose=verbose, pre_dispatch="1.5*n_jobs")(
    232         delayed(_one_episode)() for _ in range(n_paral)
    233     )
    234 ls_new_val_sum, ls_new_val_cnt = zip(*res)
    235 new_val_sum = sum(ls_new_val_sum)

File ~/.conda/envs/p311/lib/python3.11/site-packages/joblib/parallel.py:2007, in Parallel.__call__(self, iterable)
   2001 # The first item from the output is blank, but it makes the interpreter
   2002 # progress until it enters the Try/Except block of the generator and
   2003 # reaches the first `yield` statement. This starts the asynchronous
   2004 # dispatch of the tasks to the workers.
   2005 next(output)
-> 2007 return output if self.return_generator else list(output)

File ~/.conda/envs/p311/lib/python3.11/site-packages/joblib/parallel.py:1650, in Parallel._get_outputs(self, iterator, pre_dispatch)
   1647     yield
   1649     with self._backend.retrieval_context():
-> 1650         yield from self._retrieve()
   1652 except GeneratorExit:
   1653     # The generator has been garbage collected before being fully
   1654     # consumed. This aborts the remaining tasks if possible and warn
   1655     # the user if necessary.
   1656     self._exception = True

File ~/.conda/envs/p311/lib/python3.11/site-packages/joblib/parallel.py:1762, in Parallel._retrieve(self)
   1757 # If the next job is not ready for retrieval yet, we just wait for
   1758 # async callbacks to progress.
   1759 if ((len(self._jobs) == 0) or
   1760     (self._jobs[0].get_status(
   1761         timeout=self.timeout) == TASK_PENDING)):
-> 1762     time.sleep(0.01)
   1763     continue
   1765 # We need to be careful: the job list can be filling up as
   1766 # we empty it and Python list are not thread-safe by
   1767 # default hence the use of the lock

KeyboardInterrupt: 
In [ ]:
plot_arr_bj_state_val_3d(state_act_res[0].T, f'(n_ep={n_ep}, (dealer, player_ini)=(17, 20), no ace)')
plot_arr_bj_state_val_3d(state_act_res[1].T, f'(n_ep={n_ep}, (dealer, player_ini)=(17, 20), with ace)')
In [ ]:
plot_state_policy([state_act_res[4], state_act_res[5]], f'(n_ep={n_ep}, (dealer, player_ini)=(17, 20))')
In [ ]:
plot_arr_bj_state_val(state_act_res[6], f'(n_ep={n_ep}, (dealer, player_ini)=(17, 20)), stick')
In [ ]:
plot_arr_bj_state_val(state_act_res[7], f'(n_ep={n_ep}, (dealer, player_ini)=(17, 20)), hit')

Dealer policy: "smart"

In [ ]:
%%time
# 49 mins
n_ep=4_000_000
state_act_res = Monte_Carlo_ES_blackjack_1(
    n_ep=n_ep, n_paral=50, dealer_thre=17, ini_player_thre=20, dealer_policy='smart',
    pol_impr=True, show_rdis=show_rdis, n_jobs=5, verbose=0
)
2024-07-14 07:14:44,268|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 0
2024-07-14 07:14:44,269|(/tmp/ipykernel_37034/2682771488.py)[195]: --------------------round: 0
/tmp/ipykernel_37034/2682771488.py:197: RuntimeWarning:

invalid value encountered in scalar divide

2024-07-14 07:15:21,687|(/tmp/ipykernel_37034/2682771488.py)[195]: --------------------round: 1000
2024-07-14 07:17:11,746|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 4000
2024-07-14 07:17:47,663|(/tmp/ipykernel_37034/2682771488.py)[195]: --------------------round: 5000
2024-07-14 07:19:37,909|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 8000
2024-07-14 07:20:50,030|(/tmp/ipykernel_37034/2682771488.py)[195]: --------------------round: 10000
2024-07-14 07:22:04,407|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 12000
2024-07-14 07:24:29,127|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 16000
2024-07-14 07:26:54,863|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 20000
2024-07-14 07:26:54,864|(/tmp/ipykernel_37034/2682771488.py)[195]: --------------------round: 20000
2024-07-14 07:29:23,036|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 24000
2024-07-14 07:31:49,331|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 28000
2024-07-14 07:33:02,555|(/tmp/ipykernel_37034/2682771488.py)[195]: --------------------round: 30000
2024-07-14 07:34:18,384|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 32000
2024-07-14 07:36:45,726|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 36000
2024-07-14 07:39:13,533|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 40000
2024-07-14 07:39:13,534|(/tmp/ipykernel_37034/2682771488.py)[195]: --------------------round: 40000
2024-07-14 07:41:44,293|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 44000
2024-07-14 07:44:13,667|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 48000
2024-07-14 07:45:28,280|(/tmp/ipykernel_37034/2682771488.py)[195]: --------------------round: 50000
2024-07-14 07:46:45,243|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 52000
2024-07-14 07:49:14,956|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 56000
2024-07-14 07:51:45,341|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 60000
2024-07-14 07:51:45,342|(/tmp/ipykernel_37034/2682771488.py)[195]: --------------------round: 60000
2024-07-14 07:54:18,870|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 64000
2024-07-14 07:56:50,563|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 68000
2024-07-14 07:58:06,996|(/tmp/ipykernel_37034/2682771488.py)[195]: --------------------round: 70000
2024-07-14 07:59:25,749|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 72000
2024-07-14 08:01:59,287|(/tmp/ipykernel_37034/2682771488.py)[176]: ====================round: 76000
In [ ]:
plot_arr_bj_state_val_3d(state_act_res[0].T, f'(n_ep={n_ep}, (dealer, player_ini)=(17, 20), no ace)')
plot_arr_bj_state_val_3d(state_act_res[1].T, f'(n_ep={n_ep}, (dealer, player_ini)=(17, 20), with ace)')
In [ ]:
plot_state_policy([state_act_res[4], state_act_res[5]], f'(n_ep={n_ep}, (dealer, player_ini)=(17, 20))')
In [ ]:
plot_arr_bj_state_val(state_act_res[6], f'(n_ep={n_ep}, (dealer, player_ini)=(17, 20)), stick')
In [ ]:
plot_arr_bj_state_val(state_act_res[7], f'(n_ep={n_ep}, (dealer, player_ini)=(17, 20)), hit')
In [ ]:
 
In [ ]:
arr = np.arange(12).reshape(2,3,2)
arr
Out[ ]:
array([[[ 0,  1],
        [ 2,  3],
        [ 4,  5]],

       [[ 6,  7],
        [ 8,  9],
        [10, 11]]])
In [ ]:
np.diff(arr, axis=-1).squeeze(axis=-1)
Out[ ]:
array([[1, 1, 1],
       [1, 1, 1]])
In [ ]:
np.max(arr, axis=-1)
Out[ ]:
array([[ 1,  3,  5],
       [ 7,  9, 11]])